Rmarkdown to pre-process scRNA seq from fibril stimulated PBMCs.

Create the Seurat Object.

.libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths()))

library(sctransform, lib.loc = "/hpc/users/richaa21/.Rlib")
library(Seurat, lib.loc = "/hpc/users/richaa21/.Rlib")
## Loading required package: SeuratObject
## Loading required package: sp
## 'SeuratObject' was built under R 4.2.0 but the current version is
## 4.3.0; it is recomended that you reinstall 'SeuratObject' as the ABI
## for R may have changed
## 'SeuratObject' was built with package 'Matrix' 1.6.4 but the current
## version is 1.6.5; it is recomended that you reinstall 'SeuratObject' as
## the ABI for 'Matrix' may have changed
## 
## Attaching package: 'SeuratObject'
## The following object is masked from 'package:base':
## 
##     intersect
library(patchwork)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
pbmc.data <- Read10X(data.dir = "/sc/arion/projects/ad-omics/ashley/data/aF/outs/per_sample_outs/aF/count/sample_filtered_feature_bc_matrix")

pbmc.f <- CreateSeuratObject(counts = pbmc.data, project = "fibril", min.cells = 3, min.features = 200)
pbmc.f
## An object of class Seurat 
## 23791 features across 18327 samples within 1 assay 
## Active assay: RNA (23791 features, 0 variable features)
##  1 layer present: counts

Lets check how many cells and features we are starting with.

length(colnames(pbmc.f)) #number of cells
## [1] 18327
length(rownames(pbmc.f)) #number of features 
## [1] 23791

Quality Control (QC)

a. QC mitochondiral genes. The PercentFeatureSet() calculates the % of counts originating from a set of features - here we are first looking at mitochondiral features, which are genes starting with MT. 
b. By using [[ ]], I am adding columns to the pbmc matrix to store this QC data. 
pbmc.f[["percent.mt"]] <- PercentageFeatureSet(pbmc.f, pattern = "^MT-")

# Show QC metrics for the first 5 cells
head(pbmc.f@meta.data, 5)
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACCAT-1     fibril       7898         2915   4.304887
## AAACCTGAGAAGGACA-1     fibril      10204         3359   5.556644
## AAACCTGAGAATCTCC-1     fibril      15411         4590   2.602038
## AAACCTGAGAATGTGT-1     fibril       8500         2962   4.141176
## AAACCTGAGACGCACA-1     fibril      30572         6417   3.185922
library(farver, lib.loc = "/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library")
VlnPlot(pbmc.f, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

plot1 <- FeatureScatter(pbmc.f, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc.f, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 

plot2

# I will select for %mt under 10% and features greater than 200 to capture alive healthy cells. 
pbmc.f <- subset(pbmc.f, subset = nFeature_RNA > 200 & percent.mt < 10)
head(pbmc.f@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACCAT-1     fibril       7898         2915   4.304887
## AAACCTGAGAAGGACA-1     fibril      10204         3359   5.556644
## AAACCTGAGAATCTCC-1     fibril      15411         4590   2.602038
## AAACCTGAGAATGTGT-1     fibril       8500         2962   4.141176
## AAACCTGAGACGCACA-1     fibril      30572         6417   3.185922
## AAACCTGAGCTGAAAT-1     fibril       7030         2652   3.911807

Normalize the data.

The data is normalized based on the feature (number of genes in a cell) by the total expression. This number is multiplied by 10,000 and then log transformed. The function to do this is “NormalizeData.” The values specied below are the default values of this function.

pbmc.f <- NormalizeData(pbmc.f, normalization.method = "LogNormalize", scale.factor = 10000)
## Normalizing layer: counts

Identification of highly variable features

pbmc.f <- FindVariableFeatures(pbmc.f, selection.method = "vst", nfeatures = 2000)
## Finding variable features for layer counts
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc.f), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc.f)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE, xnudge = 0, ynudge = 0)
plot1 

plot2

## Scaling the Data The data is scaled by doing a linear transformation. The ScaleData function does this by shifting the expression of genes so that the mean expression becomes 0 and the variance is 1. By default, only variable genes are scaled. This is changed by features = all.genes

##Scaling RNA data, we only scale the variable features here for efficiency
all.genes <- rownames(pbmc.f)
pbmc.f <- ScaleData(pbmc.f, vars.to.regress = c("percent.mt"))
## Regressing out percent.mt
## Centering and scaling data matrix

Perform linear dimensional reduction

For the first principal components, RunPCA shows the most positive (correlated) and most negative (anticorrelated) genes

pbmc.f <- RunPCA(pbmc.f, features = VariableFeatures(object = pbmc.f))
## PC_ 1 
## Positive:  LTB, TSC22D3, IL7R, FP236383.3, TRIM22, TCF7, AQP3, IFITM1, TNFSF8, LEF1 
##     PIM2, CCR7, JAML, C1orf162, LINC00861, CORO1B, NEAT1, MAL, MYC, CD5 
##     NCDN, BEX3, TIMP1, CD4, SESN3, S100A6, BBC3, HERPUD1, AC139720.1, CYSLTR1 
## Negative:  RRM2, TYMS, MKI67, TUBA1B, STMN1, PCLAF, CDK1, UBE2C, TOP2A, KIFC1 
##     NUSAP1, ASF1B, ZWINT, TUBB, CCNA2, ASPM, CENPF, GTSE1, PKMYT1, HIST1H1B 
##     KNL1, CDCA8, CDKN3, SPC25, GGH, TPX2, CKAP2L, BIRC5, CENPU, CDCA2 
## PC_ 2 
## Positive:  CTSW, PRF1, GZMB, CCL4, NKG7, CCL3, KLRD1, TYROBP, GZMA, KLRC1 
##     FCER1G, AOAH, CST7, IL2RB, CCL5, SRGN, TRDC, GNLY, HOPX, KLRK1 
##     FCGR3A, GZMK, NCAM1, GSTP1, KIR2DL4, LYST, EOMES, KLRC2, CD38, SH2D1B 
## Negative:  IL7R, TIMP1, AQP3, S100A6, LIME1, COTL1, S100A10, S100A4, CD5, TSC22D3 
##     SNED1, IFITM1, GPR183, WDR86-AS1, CD4, ANP32B, CORO1B, LEF1, ITGB1, LMNA 
##     MYC, CRIP1, STAT1, PLP2, PRDX1, SOS1, INPP4B, RGCC, UBXN11, IL9R 
## PC_ 3 
## Positive:  PLK1, CLIC3, PLEK, TYROBP, FCER1G, CEBPD, FGFBP2, CENPA, CX3CR1, CCNB1 
##     DLGAP5, FCGR3A, ASPM, KLRD1, KIF20A, PSRC1, FGR, HMMR, CCNB2, CDKN2D 
##     CD300A, PRF1, GZMK, SH2D1B, KIF14, CENPF, PIMREG, TROAP, NEK2, TRDC 
## Negative:  GINS2, HSP90AB1, MCM2, MCM6, CSF2, MCM5, TRBV6-2, PLPP1, PAICS, CDC6 
##     TRAV12-2, RANBP1, MCM7, UNG, MCM3, MCM4, HSPE1, NME1, MIF, CDC45 
##     SRM, HSPD1, DCTPP1, E2F1, CD82, TCTN3, IL2RA, TPI1, PHLDA1, DUSP4 
## PC_ 4 
## Positive:  TRBV6-2, CD27, TRAV12-2, LRRN3, PLPP1, CD8A, PECAM1, CD8B, MYO1E, SNX9 
##     SIRPG, REG4, RASGRF2, BCL2L11, CBLB, BACH2, CSF2, LAIR2, AL136962.1, PDE3B 
##     APBB2, BHLHE40-AS1, CRTAM, LYST, DUSP2, TRBV5-4, CCR7, GNG8, ITGA1, TCF7 
## Negative:  HLA-DRB1, HLA-DPA1, S100A4, LGALS1, S100A6, S100A10, HLA-DPB1, HLA-DRA, HLA-DQA1, PLEK 
##     ANXA2, TXN, HLA-DQB1, LGALS3, HLA-DRB5, CRIP1, CD74, TMSB10, PFN1, CD300A 
##     TAGLN2, FTL, TYROBP, CLU, GINS2, TBXAS1, IFITM1, LIME1, TIMP1, EFHD2 
## PC_ 5 
## Positive:  LGALS1, S100A4, S100A6, ANXA2, C12orf75, ACTB, S100A10, TXN, HOPX, PFN1 
##     GAPDH, ACTG1, TMSB10, CFL1, CD74, LAG3, LINC02694, PRDX1, HLA-DPA1, VIM 
##     PHLDA1, FLNA, HLA-DRB1, JPT1, AHNAK, HLA-DRA, THEMIS, ALOX5AP, CD82, TIMP1 
## Negative:  CCR7, TCF7, LEF1, TRIM22, SESN3, TYROBP, CXCR4, FAM111B, FCER1G, PTMA 
##     TXK, RNF130, PLAC8, MCM7, DHRS3, PTGDR, DTL, CYSLTR1, CDC6, CCNE2 
##     CLSPN, SELL, TK1, MYBL2, SH2D1B, FEN1, LINC00861, FHIT, KCNQ5, NCAM1
print(pbmc.f[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  LTB, TSC22D3, IL7R, FP236383.3, TRIM22 
## Negative:  RRM2, TYMS, MKI67, TUBA1B, STMN1 
## PC_ 2 
## Positive:  CTSW, PRF1, GZMB, CCL4, NKG7 
## Negative:  IL7R, TIMP1, AQP3, S100A6, LIME1 
## PC_ 3 
## Positive:  PLK1, CLIC3, PLEK, TYROBP, FCER1G 
## Negative:  GINS2, HSP90AB1, MCM2, MCM6, CSF2 
## PC_ 4 
## Positive:  TRBV6-2, CD27, TRAV12-2, LRRN3, PLPP1 
## Negative:  HLA-DRB1, HLA-DPA1, S100A4, LGALS1, S100A6 
## PC_ 5 
## Positive:  LGALS1, S100A4, S100A6, ANXA2, C12orf75 
## Negative:  CCR7, TCF7, LEF1, TRIM22, SESN3

The PCA results can be visualized in different ways.

VizDimLoadings(pbmc.f, dims = 1:2, reduction = "pca")

DimPlot(pbmc.f, reduction = "pca") + NoLegend()

DimHeatmap(pbmc.f, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(pbmc.f, dims = 1:15, cells = 500, balanced = TRUE)

Determine the ‘dimensionality’ of the dataset

Cells will be clustered based on PCA. How many PC to use is dependent on many factors. For example, if trying to analyze a rare cell subset, you might want to add more PCs. Usually, the first 10 is good to see dimensionality of the data.

ElbowPlot(pbmc.f)

Cluster the cells.

##We select the top 15 PCs for clustering and tSNE based on PCElbowPlot
pbmc.f <- FindNeighbors(pbmc.f, reduction = "pca", dims = 1:15)
## Computing nearest neighbor graph
## Computing SNN
pbmc.f <- FindClusters(pbmc.f, resolution = 0.5, verbose = FALSE)
head(Idents(pbmc.f), 5)
## AAACCTGAGAAACCAT-1 AAACCTGAGAAGGACA-1 AAACCTGAGAATCTCC-1 AAACCTGAGAATGTGT-1 
##                  4                  1                  9                  1 
## AAACCTGAGACGCACA-1 
##                  7 
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12

Run non-linear dimensional reduction (UMAP/tSNE)

pbmc.f <- RunUMAP(pbmc.f, dims = 1:15)
## 10:42:02 UMAP embedding parameters a = 0.9922 b = 1.112
## 10:42:02 Read 17726 rows and found 15 numeric columns
## 10:42:02 Using Annoy for neighbor search, n_neighbors = 30
## 10:42:02 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:42:04 Writing NN index file to temp file /tmp/RtmpRlAiI7/filed09935ef3302
## 10:42:04 Searching Annoy index using 1 thread, search_k = 3000
## 10:42:09 Annoy recall = 100%
## 10:42:10 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 10:42:11 Initializing from normalized Laplacian + noise (using RSpectra)
## 10:42:11 Commencing optimization for 200 epochs, with 755922 positive edges
## 10:42:33 Optimization finished
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(pbmc.f, reduction = "umap", label = TRUE)

pbmc.f <- RunTSNE(pbmc.f, reduction = "pca", dims = 1:15)
DimPlot(pbmc.f, reduction = "tsne", label = TRUE)

Lets check our metadata now of the seurat object to see what has been added.

head(pbmc.f@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACCAT-1     fibril       7898         2915   4.304887
## AAACCTGAGAAGGACA-1     fibril      10204         3359   5.556644
## AAACCTGAGAATCTCC-1     fibril      15411         4590   2.602038
## AAACCTGAGAATGTGT-1     fibril       8500         2962   4.141176
## AAACCTGAGACGCACA-1     fibril      30572         6417   3.185922
## AAACCTGAGCTGAAAT-1     fibril       7030         2652   3.911807
##                    RNA_snn_res.0.5 seurat_clusters
## AAACCTGAGAAACCAT-1               4               4
## AAACCTGAGAAGGACA-1               1               1
## AAACCTGAGAATCTCC-1               9               9
## AAACCTGAGAATGTGT-1               1               1
## AAACCTGAGACGCACA-1               7               7
## AAACCTGAGCTGAAAT-1               0               0
#We now have seurat clusters and RNA_snn_res.0.5 columns added. 

Finding differentially expressed features (cluster biomarkers)

# find markers for every cluster compared to all remaining cells, report only the positive
# ones
markers_f <- FindAllMarkers(pbmc.f, only.pos = TRUE)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12

VlnPlot will display the differential expression across the clusters. For example, I am looking here at CD8A and CD4 expression in the clusters.

VlnPlot(pbmc.f, features = c("CD8A", "CD4"))

The raw counts can also be shown instead by adding some parameters.

VlnPlot(pbmc.f, features = c("CD8A", "CD4"), slot = "counts", log = TRUE)

FeaturePlot(pbmc.f, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
    "CD8A"))

Combine Seurat object with demuxlet output.

First, load and edit the .best demuxlet output file to make it more compatible.

.libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths()))

library(patchwork)
library(tidyr)
f_demuxlet = read.delim("/sc/arion/projects/ad-omics/ashley/data/aF.best", header = T, stringsAsFactors = F, check.names = F)
head(f_demuxlet)
##              BARCODE RD.TOTL RD.PASS RD.UNIQ N.SNP                    BEST
## 1 AAACCTGAGAAACCAT-1   20230    2729    2289  1191 SNG-GSA8_0_NYUMD0334-01
## 2 AAACCTGAGAAGGACA-1   28220    3441    2915  1485 SNG-GSA8_0_NYUMD0289-01
## 3 AAACCTGAGAATCTCC-1   43192    6406    5064  2716 SNG-GSA8_0_NYUMD0327-01
## 4 AAACCTGAGAATGTGT-1   23164    3096    2623  1317 SNG-GSA8_0_NYUMD0334-01
## 5 AAACCTGAGACGCACA-1   80482   12257   10037  4265 SNG-GSA8_0_NYUMD0334-01
## 6 AAACCTGAGCTGAAAT-1   18804    2299    1930  1096 SNG-GSA8_0_NYUMD0289-01
##               SNG.1ST   SNG.LLK1             SNG.2ND  SNG.LLK2   SNG.LLK0
## 1 GSA8_0_NYUMD0334-01  -436.0108 GSA8_0_NYUMD0289-01 -1144.530  -700.9854
## 2 GSA8_0_NYUMD0289-01  -551.2455 GSA8_0_NYUMD0327-01 -1348.805  -849.1844
## 3 GSA8_0_NYUMD0327-01  -978.9799 GSA8_0_NYUMD0334-01 -2477.158 -1538.9178
## 4 GSA8_0_NYUMD0334-01  -501.0333 GSA8_0_NYUMD0315-01 -1271.823  -786.8994
## 5 GSA8_0_NYUMD0334-01 -1514.3772 GSA8_0_NYUMD0289-01 -3999.847 -2477.8828
## 6 GSA8_0_NYUMD0289-01  -422.6331 GSA8_0_NYUMD0334-01 -1018.341  -640.8195
##               DBL.1ST             DBL.2ND ALPHA      LLK12       LLK1
## 1 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0334-01   0.5  -650.5111 -1179.6639
## 2 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0327-01   0.5  -798.6921  -551.2455
## 3 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01   0.5 -1423.6142  -978.9799
## 4 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01   0.5  -735.1817 -1273.6766
## 5 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0334-01   0.5 -2273.7404 -3999.8471
## 6 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0334-01   0.5  -599.9499  -422.6331
##         LLK2      LLK10      LLK20      LLK00   PRB.DBL PRB.SNG1
## 1  -436.0108 -1007.9044  -658.1725  -717.9619  2.33e-94        1
## 2 -1348.8051  -555.0604  -798.6921  -875.2910 1.14e-108        1
## 3 -2477.1579 -1464.5035 -2200.8787 -1583.0830 2.63e-194        1
## 4  -501.0333 -1136.8613  -738.3541  -809.4175 8.21e-103        1
## 5 -1514.3772 -4005.0462 -2273.7404 -2537.6268  0.00e+00        1
## 6 -1018.3409  -424.8764  -599.9499  -659.8594  3.29e-78        1

To edit: I will split the Best column into multiple columns.

f_demuxlet_edit = f_demuxlet %>% 
  mutate(BEST = gsub("-01","", BEST)) %>%
  separate(BEST, into=c("DMX_classification.global","DMX_maxID","DMX_secondID"), sep="-") %>%
  separate(DMX_maxID, into=c("DMX_garbage1","DMX_garbage2","DMX_maxID"), sep ="_") %>%
  separate(DMX_secondID, into=c("DMX_garbage3","DMX_garbage4","DMX_secondID"), sep ="_") %>%
  select(-contains("garbage"))
## Warning: Expected 3 pieces. Additional pieces discarded in 2522 rows [13, 18,
## 20, 25, 27, 29, 35, 48, 63, 66, 69, 71, 79, 82, 100, 102, 103, 107, 116, 124,
## ...].
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 15896 rows [1,
## 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 19, 21, 22, 23, ...].
head(f_demuxlet_edit)
##              BARCODE RD.TOTL RD.PASS RD.UNIQ N.SNP DMX_classification.global
## 1 AAACCTGAGAAACCAT-1   20230    2729    2289  1191                       SNG
## 2 AAACCTGAGAAGGACA-1   28220    3441    2915  1485                       SNG
## 3 AAACCTGAGAATCTCC-1   43192    6406    5064  2716                       SNG
## 4 AAACCTGAGAATGTGT-1   23164    3096    2623  1317                       SNG
## 5 AAACCTGAGACGCACA-1   80482   12257   10037  4265                       SNG
## 6 AAACCTGAGCTGAAAT-1   18804    2299    1930  1096                       SNG
##   DMX_maxID DMX_secondID             SNG.1ST   SNG.LLK1             SNG.2ND
## 1 NYUMD0334         <NA> GSA8_0_NYUMD0334-01  -436.0108 GSA8_0_NYUMD0289-01
## 2 NYUMD0289         <NA> GSA8_0_NYUMD0289-01  -551.2455 GSA8_0_NYUMD0327-01
## 3 NYUMD0327         <NA> GSA8_0_NYUMD0327-01  -978.9799 GSA8_0_NYUMD0334-01
## 4 NYUMD0334         <NA> GSA8_0_NYUMD0334-01  -501.0333 GSA8_0_NYUMD0315-01
## 5 NYUMD0334         <NA> GSA8_0_NYUMD0334-01 -1514.3772 GSA8_0_NYUMD0289-01
## 6 NYUMD0289         <NA> GSA8_0_NYUMD0289-01  -422.6331 GSA8_0_NYUMD0334-01
##    SNG.LLK2   SNG.LLK0             DBL.1ST             DBL.2ND ALPHA      LLK12
## 1 -1144.530  -700.9854 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0334-01   0.5  -650.5111
## 2 -1348.805  -849.1844 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0327-01   0.5  -798.6921
## 3 -2477.158 -1538.9178 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01   0.5 -1423.6142
## 4 -1271.823  -786.8994 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01   0.5  -735.1817
## 5 -3999.847 -2477.8828 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0334-01   0.5 -2273.7404
## 6 -1018.341  -640.8195 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0334-01   0.5  -599.9499
##         LLK1       LLK2      LLK10      LLK20      LLK00   PRB.DBL PRB.SNG1
## 1 -1179.6639  -436.0108 -1007.9044  -658.1725  -717.9619  2.33e-94        1
## 2  -551.2455 -1348.8051  -555.0604  -798.6921  -875.2910 1.14e-108        1
## 3  -978.9799 -2477.1579 -1464.5035 -2200.8787 -1583.0830 2.63e-194        1
## 4 -1273.6766  -501.0333 -1136.8613  -738.3541  -809.4175 8.21e-103        1
## 5 -3999.8471 -1514.3772 -4005.0462 -2273.7404 -2537.6268  0.00e+00        1
## 6  -422.6331 -1018.3409  -424.8764  -599.9499  -659.8594  3.29e-78        1
table(f_demuxlet_edit$DMX_classification.global) #num of singlets and doublets
## 
##   AMB   DBL   SNG 
##    17  2505 15896
table(f_demuxlet_edit$DMX_maxID) #number of cells identified as each donor
## 
## NYUMD0289 NYUMD0315 NYUMD0327 NYUMD0334 
##      6750      3559      3609      4500
table(f_demuxlet_edit[,c("DMX_classification.global","DMX_maxID")]) #number of singlets or doublets identified as each donor
##                          DMX_maxID
## DMX_classification.global NYUMD0289 NYUMD0315 NYUMD0327 NYUMD0334
##                       AMB         5         4         4         4
##                       DBL      1481       610       399        15
##                       SNG      5264      2945      3206      4481

Next, I will add this edited demuxlet to the Seurat object.

f_demuxlet_edit.subset <- f_demuxlet_edit[f_demuxlet_edit$BARCODE %in% colnames(pbmc.f),]

pbmc.f@meta.data <- cbind(pbmc.f@meta.data,f_demuxlet_edit.subset$DMX_maxID,f_demuxlet_edit.subset$DMX_classification.global, f_demuxlet_edit.subset$BARCODE)

head(pbmc.f@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACCAT-1     fibril       7898         2915   4.304887
## AAACCTGAGAAGGACA-1     fibril      10204         3359   5.556644
## AAACCTGAGAATCTCC-1     fibril      15411         4590   2.602038
## AAACCTGAGAATGTGT-1     fibril       8500         2962   4.141176
## AAACCTGAGACGCACA-1     fibril      30572         6417   3.185922
## AAACCTGAGCTGAAAT-1     fibril       7030         2652   3.911807
##                    RNA_snn_res.0.5 seurat_clusters
## AAACCTGAGAAACCAT-1               4               4
## AAACCTGAGAAGGACA-1               1               1
## AAACCTGAGAATCTCC-1               9               9
## AAACCTGAGAATGTGT-1               1               1
## AAACCTGAGACGCACA-1               7               7
## AAACCTGAGCTGAAAT-1               0               0
##                    f_demuxlet_edit.subset$DMX_maxID
## AAACCTGAGAAACCAT-1                        NYUMD0334
## AAACCTGAGAAGGACA-1                        NYUMD0289
## AAACCTGAGAATCTCC-1                        NYUMD0327
## AAACCTGAGAATGTGT-1                        NYUMD0334
## AAACCTGAGACGCACA-1                        NYUMD0334
## AAACCTGAGCTGAAAT-1                        NYUMD0289
##                    f_demuxlet_edit.subset$DMX_classification.global
## AAACCTGAGAAACCAT-1                                              SNG
## AAACCTGAGAAGGACA-1                                              SNG
## AAACCTGAGAATCTCC-1                                              SNG
## AAACCTGAGAATGTGT-1                                              SNG
## AAACCTGAGACGCACA-1                                              SNG
## AAACCTGAGCTGAAAT-1                                              SNG
##                    f_demuxlet_edit.subset$BARCODE
## AAACCTGAGAAACCAT-1             AAACCTGAGAAACCAT-1
## AAACCTGAGAAGGACA-1             AAACCTGAGAAGGACA-1
## AAACCTGAGAATCTCC-1             AAACCTGAGAATCTCC-1
## AAACCTGAGAATGTGT-1             AAACCTGAGAATGTGT-1
## AAACCTGAGACGCACA-1             AAACCTGAGACGCACA-1
## AAACCTGAGCTGAAAT-1             AAACCTGAGCTGAAAT-1

Pre-Processing the new object.

The Seurat Object has already been preprocessed in my case, so this should be clean)

pbmc.f[["percent.mt"]] <- PercentageFeatureSet(pbmc.f, pattern = "^MT-")

library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:patchwork':
## 
##     align_plots
# look at distribution of metrics by classification
plot_grid(VlnPlot(pbmc.f, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "f_demuxlet_edit.subset$DMX_maxID"))

VlnPlot(pbmc.f, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "f_demuxlet_edit.subset$DMX_classification.global")

Now, select for MT content under 10% and for nfeatureRNA > 200 to ensure getting alive cells. I previously did this, so should see no change in cell numnbers.

pbmc.f <- subset(pbmc.f, subset = nFeature_RNA > 200 & percent.mt < 10)
length(colnames(pbmc.f))
## [1] 17726
length(rownames(pbmc.f))
## [1] 23791

Add column to meta data to identify seurat object as Basline condition. Also rename some columns for clarity purposes.

pbmc.f@meta.data$condition <- 'Fibril'
names(pbmc.f@meta.data)[names(pbmc.f@meta.data) == "f_demuxlet_edit.subset$DMX_classification.global"] <- "DMX_classification.global"
names(pbmc.f@meta.data)[names(pbmc.f@meta.data) == "f_demuxlet_edit.subset$DMX_maxID"] <- "DMX_maxID"
names(pbmc.f@meta.data)[names(pbmc.f@meta.data) == "f_demuxlet_edit.subset$BARCODE"] <- "Barcode"
head(pbmc.f@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACCAT-1     fibril       7898         2915   4.304887
## AAACCTGAGAAGGACA-1     fibril      10204         3359   5.556644
## AAACCTGAGAATCTCC-1     fibril      15411         4590   2.602038
## AAACCTGAGAATGTGT-1     fibril       8500         2962   4.141176
## AAACCTGAGACGCACA-1     fibril      30572         6417   3.185922
## AAACCTGAGCTGAAAT-1     fibril       7030         2652   3.911807
##                    RNA_snn_res.0.5 seurat_clusters DMX_maxID
## AAACCTGAGAAACCAT-1               4               4 NYUMD0334
## AAACCTGAGAAGGACA-1               1               1 NYUMD0289
## AAACCTGAGAATCTCC-1               9               9 NYUMD0327
## AAACCTGAGAATGTGT-1               1               1 NYUMD0334
## AAACCTGAGACGCACA-1               7               7 NYUMD0334
## AAACCTGAGCTGAAAT-1               0               0 NYUMD0289
##                    DMX_classification.global            Barcode condition
## AAACCTGAGAAACCAT-1                       SNG AAACCTGAGAAACCAT-1    Fibril
## AAACCTGAGAAGGACA-1                       SNG AAACCTGAGAAGGACA-1    Fibril
## AAACCTGAGAATCTCC-1                       SNG AAACCTGAGAATCTCC-1    Fibril
## AAACCTGAGAATGTGT-1                       SNG AAACCTGAGAATGTGT-1    Fibril
## AAACCTGAGACGCACA-1                       SNG AAACCTGAGACGCACA-1    Fibril
## AAACCTGAGCTGAAAT-1                       SNG AAACCTGAGCTGAAAT-1    Fibril
saveRDS(pbmc.f, file = "/sc/arion/projects/ad-omics/ashley/PD_Stim/pbmc.fibril.final.RDS")